home *** CD-ROM | disk | FTP | other *** search
- # SpamAssassin rules file: HTML tests
- #
- # Please don't modify this file as your changes will be overwritten with
- # the next update. Use @@LOCAL_RULES_DIR@@/local.cf instead.
- # See 'perldoc Mail::SpamAssassin::Conf' for details.
- #
- # <@LICENSE>
- # Copyright 2004 Apache Software Foundation
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- # </@LICENSE>
- #
- ###########################################################################
-
- require_version @@VERSION@@
-
- # HTML parser tests
- #
- # please sort these by eval type then name
-
- # HTML control test, HTML spam rules should all have better S/O than this
- # (S/O is presumably the spam-to-overall hit ratio -- TODO confirm).
- # Baseline rule: calls html_test with the 'html_message' key, so it serves
- # as the reference point the other HTML rules in this file are compared to.
- body HTML_MESSAGE eval:html_test('html_message')
- describe HTML_MESSAGE HTML included in message
-
- # HTML percentage buckets: ten rules, each calling html_range on the
- # 'ratio' key with a 0.10-wide window from 0.00 up to 1.00. Every rule is
- # listed together with its description. How the window endpoints are
- # treated is decided inside html_range, not in this file.
- # (should really be converted into a numeric function test)
- body HTML_00_10 eval:html_range('ratio','0.00','0.10')
- describe HTML_00_10 Message is 0% to 10% HTML
- body HTML_10_20 eval:html_range('ratio','0.10','0.20')
- describe HTML_10_20 Message is 10% to 20% HTML
- body HTML_20_30 eval:html_range('ratio','0.20','0.30')
- describe HTML_20_30 Message is 20% to 30% HTML
- body HTML_30_40 eval:html_range('ratio','0.30','0.40')
- describe HTML_30_40 Message is 30% to 40% HTML
- body HTML_40_50 eval:html_range('ratio','0.40','0.50')
- describe HTML_40_50 Message is 40% to 50% HTML
- body HTML_50_60 eval:html_range('ratio','0.50','0.60')
- describe HTML_50_60 Message is 50% to 60% HTML
- body HTML_60_70 eval:html_range('ratio','0.60','0.70')
- describe HTML_60_70 Message is 60% to 70% HTML
- body HTML_70_80 eval:html_range('ratio','0.70','0.80')
- describe HTML_70_80 Message is 70% to 80% HTML
- body HTML_80_90 eval:html_range('ratio','0.80','0.90')
- describe HTML_80_90 Message is 80% to 90% HTML
- body HTML_90_100 eval:html_range('ratio','0.90','1.00')
- describe HTML_90_100 Message is 90% to 100% HTML
-
- # HTML shouting buckets: five html_range rules on the 'max_shouting' key,
- # one per unit-wide window from 2-3 up to 6-7. The number in each rule name
- # is the upper end of its window; all five share the same description.
- # (should really be converted into a numeric function test)
- body HTML_SHOUTING3 eval:html_range('max_shouting','2','3')
- describe HTML_SHOUTING3 HTML has very strong "shouting" markup
- body HTML_SHOUTING4 eval:html_range('max_shouting','3','4')
- describe HTML_SHOUTING4 HTML has very strong "shouting" markup
- body HTML_SHOUTING5 eval:html_range('max_shouting','4','5')
- describe HTML_SHOUTING5 HTML has very strong "shouting" markup
- body HTML_SHOUTING6 eval:html_range('max_shouting','5','6')
- describe HTML_SHOUTING6 HTML has very strong "shouting" markup
- body HTML_SHOUTING7 eval:html_range('max_shouting','6','7')
- describe HTML_SHOUTING7 HTML has very strong "shouting" markup
-
- # Trailing-text rules: two html_test flags, one for text after the HTML
- # close tag and one for text after the BODY close tag.
- body HTML_TEXT_AFTER_HTML eval:html_test('text_after_html')
- describe HTML_TEXT_AFTER_HTML HTML contains text after HTML close tag
-
- body HTML_TEXT_AFTER_BODY eval:html_test('text_after_body')
- describe HTML_TEXT_AFTER_BODY HTML contains text after BODY close tag
-
- # HTML comment tests (both patterns are applied to the 'comment' text
- # through html_text_match)
- # Short comment: "<!" not followed by "-", then at most six characters,
- # then ">".
- body HTML_COMMENT_SHORT eval:html_text_match('comment', '<!(?!-).{0,6}>')
- describe HTML_COMMENT_SHORT HTML comment is very short
-
- # Saved page marker: the literal text "<!-- saved from url=" followed by a
- # four-digit number in parentheses.
- body HTML_COMMENT_SAVED_URL eval:html_text_match('comment', '<!-- saved from url=\(\d{4}\)')
- describe HTML_COMMENT_SAVED_URL HTML message is a saved web page
-
- # Comment is a spam sign when following <DIV>
- # (the detection itself happens behind the 'div_converted' html_test flag)
- body HTML_CONVERTED eval:html_test('div_converted')
- describe HTML_CONVERTED HTML conversion tool used by spam
-
- # Flag rule on the 'embeds' html_test key.
- body HTML_EMBEDS eval:html_test('embeds')
- describe HTML_EMBEDS HTML with embedded plugin object
-
- # Flag rule on the 'html_event_unsafe' html_test key.
- body HTML_EVENT_UNSAFE eval:html_test('html_event_unsafe')
- describe HTML_EVENT_UNSAFE HTML contains unsafe auto-executing code
-
- # Font size rules on the 'min_size' key. Any value satisfying '< 0' also
- # satisfies '< 1', so HTML_FONT_SIZE_NONE is the stricter variant of
- # HTML_FONT_SIZE_TINY (assuming html_eval applies the comparison
- # numerically -- TODO confirm).
- body HTML_FONT_SIZE_TINY eval:html_eval('min_size', '< 1')
- describe HTML_FONT_SIZE_TINY HTML font size is tiny
-
- body HTML_FONT_SIZE_NONE eval:html_eval('min_size', '< 0')
- describe HTML_FONT_SIZE_NONE HTML font size is negative
-
- # Font size rules on the 'max_size' key: a 5-6 window for "large" and a
- # window from 6 with an 'inf' upper bound for "huge".
- body HTML_FONT_SIZE_LARGE eval:html_range('max_size', '5', '6')
- describe HTML_FONT_SIZE_LARGE HTML font size is large
-
- body HTML_FONT_SIZE_HUGE eval:html_range('max_size', '6', 'inf')
- describe HTML_FONT_SIZE_HUGE HTML font size is huge
-
- # Flag rules: each one checks a single html_test key ('big_font',
- # 'tiny_font', 'font_invisible', 'font_low_contrast', 'font_face_bad',
- # 'font_face_caps').
- body HTML_FONT_BIG eval:html_test('big_font')
- describe HTML_FONT_BIG HTML tag for a big font size
-
- body HTML_FONT_TINY eval:html_test('tiny_font')
- describe HTML_FONT_TINY HTML tag for a tiny font size
-
- body HTML_FONT_INVISIBLE eval:html_test('font_invisible')
- describe HTML_FONT_INVISIBLE HTML font color is same as background
-
- body HTML_FONT_LOW_CONTRAST eval:html_test('font_low_contrast')
- describe HTML_FONT_LOW_CONTRAST HTML font color similar to background
-
- body HTML_FONT_FACE_BAD eval:html_test('font_face_bad')
- describe HTML_FONT_FACE_BAD HTML font face is not a word
-
- body HTML_FONT_FACE_CAPS eval:html_test('font_face_caps')
- describe HTML_FONT_FACE_CAPS HTML font face has excess capital characters
-
- # Flag rule on the 'form_action_mailto' html_test key.
- body HTML_FORMACTION_MAILTO eval:html_test('form_action_mailto')
- describe HTML_FORMACTION_MAILTO HTML includes a form which sends mail
-
- # HTML_IMAGE_ONLY - not much raw HTML with images (absolute)
- # Six html_image_only rules covering consecutive 400-wide windows from 0 to
- # 2400; the two-digit suffix of each rule name is its upper bound in
- # hundreds. Each rule is followed directly by its description.
- body HTML_IMAGE_ONLY_04 eval:html_image_only('0000','0400')
- describe HTML_IMAGE_ONLY_04 HTML: images with 0-400 bytes of words
- body HTML_IMAGE_ONLY_08 eval:html_image_only('0400','0800')
- describe HTML_IMAGE_ONLY_08 HTML: images with 400-800 bytes of words
- body HTML_IMAGE_ONLY_12 eval:html_image_only('0800','1200')
- describe HTML_IMAGE_ONLY_12 HTML: images with 800-1200 bytes of words
- body HTML_IMAGE_ONLY_16 eval:html_image_only('1200','1600')
- describe HTML_IMAGE_ONLY_16 HTML: images with 1200-1600 bytes of words
- body HTML_IMAGE_ONLY_20 eval:html_image_only('1600','2000')
- describe HTML_IMAGE_ONLY_20 HTML: images with 1600-2000 bytes of words
- body HTML_IMAGE_ONLY_24 eval:html_image_only('2000','2400')
- describe HTML_IMAGE_ONLY_24 HTML: images with 2000-2400 bytes of words
-
- # HTML_IMAGE_RATIO - more image area than text (ratio)
- # Four html_image_ratio rules covering consecutive 0.002-wide windows from
- # 0.000 to 0.008; the suffix of each rule name is the last digit of its
- # upper bound. All four share one description.
- body HTML_IMAGE_RATIO_02 eval:html_image_ratio('0.000','0.002')
- describe HTML_IMAGE_RATIO_02 HTML has a low ratio of text to image area
- body HTML_IMAGE_RATIO_04 eval:html_image_ratio('0.002','0.004')
- describe HTML_IMAGE_RATIO_04 HTML has a low ratio of text to image area
- body HTML_IMAGE_RATIO_06 eval:html_image_ratio('0.004','0.006')
- describe HTML_IMAGE_RATIO_06 HTML has a low ratio of text to image area
- body HTML_IMAGE_RATIO_08 eval:html_image_ratio('0.006','0.008')
- describe HTML_IMAGE_RATIO_08 HTML has a low ratio of text to image area
-
- # Link text check on the 'anchor' text: case-insensitive "push" or "go",
- # optional whitespace, then "here" or "this".
- body HTML_LINK_PUSH_HERE eval:html_text_match('anchor', '(?i)(?:push|go)\s*(?:here|this)')
- describe HTML_LINK_PUSH_HERE HTML link text says "push here" or similar
-
- # HTML obfuscation
- # Ten html_range rules on the 'obfuscation_ratio' key. The first window
- # starts at .05 (there is no rule below 5%); the rest are .1 wide up to 1.0.
- # Each rule is listed together with its description.
- body HTML_OBFUSCATE_05_10 eval:html_range('obfuscation_ratio','.05','.1')
- describe HTML_OBFUSCATE_05_10 Message is 5% to 10% HTML obfuscation
- body HTML_OBFUSCATE_10_20 eval:html_range('obfuscation_ratio','.1','.2')
- describe HTML_OBFUSCATE_10_20 Message is 10% to 20% HTML obfuscation
- body HTML_OBFUSCATE_20_30 eval:html_range('obfuscation_ratio','.2','.3')
- describe HTML_OBFUSCATE_20_30 Message is 20% to 30% HTML obfuscation
- body HTML_OBFUSCATE_30_40 eval:html_range('obfuscation_ratio','.3','.4')
- describe HTML_OBFUSCATE_30_40 Message is 30% to 40% HTML obfuscation
- body HTML_OBFUSCATE_40_50 eval:html_range('obfuscation_ratio','.4','.5')
- describe HTML_OBFUSCATE_40_50 Message is 40% to 50% HTML obfuscation
- body HTML_OBFUSCATE_50_60 eval:html_range('obfuscation_ratio','.5','.6')
- describe HTML_OBFUSCATE_50_60 Message is 50% to 60% HTML obfuscation
- body HTML_OBFUSCATE_60_70 eval:html_range('obfuscation_ratio','.6','.7')
- describe HTML_OBFUSCATE_60_70 Message is 60% to 70% HTML obfuscation
- body HTML_OBFUSCATE_70_80 eval:html_range('obfuscation_ratio','.7','.8')
- describe HTML_OBFUSCATE_70_80 Message is 70% to 80% HTML obfuscation
- body HTML_OBFUSCATE_80_90 eval:html_range('obfuscation_ratio','.8','.9')
- describe HTML_OBFUSCATE_80_90 Message is 80% to 90% HTML obfuscation
- body HTML_OBFUSCATE_90_100 eval:html_range('obfuscation_ratio','.9','1.0')
- describe HTML_OBFUSCATE_90_100 Message is 90% to 100% HTML obfuscation
-
- # backhair - idea from backhair set by Jennifer Wheeler and Adam Lopresto.
- # Three html_range rules on the 'backhair_count' key with windows 1-4, 4-8
- # and 8-inf. All three share one description.
- body HTML_BACKHAIR_2 eval:html_range('backhair_count', '1', '4')
- describe HTML_BACKHAIR_2 HTML tags used to obfuscate words
- body HTML_BACKHAIR_4 eval:html_range('backhair_count', '4', '8')
- describe HTML_BACKHAIR_4 HTML tags used to obfuscate words
- body HTML_BACKHAIR_8 eval:html_range('backhair_count', '8', 'inf')
- describe HTML_BACKHAIR_8 HTML tags used to obfuscate words
-
- # HTML attribute testing
- # Both attribute rules use html_range with an upper bound of 1.0: 'attr_bad'
- # from 0.75 and 'attr_unique_bad' from 0.5.
- body HTML_ATTR_BAD eval:html_range('attr_bad','0.75','1.0')
- describe HTML_ATTR_BAD HTML has many bad attributes in tags
- body HTML_ATTR_UNIQUE eval:html_range('attr_unique_bad','0.5','1.0')
- describe HTML_ATTR_UNIQUE HTML appears to have random attributes in tags
-
- # Flag rule on the 'web_bugs' html_test key.
- body HTML_WEB_BUGS eval:html_test('web_bugs')
- describe HTML_WEB_BUGS Image tag intended to identify you
-
- # Tag balance rules: html_tag_balance is given a tag name and the
- # expression '!= 0', i.e. the rule is meant to fire when the balance for
- # that tag is non-zero (how the balance is computed is not shown here).
- body HTML_TAG_BALANCE_BODY eval:html_tag_balance('body', '!= 0')
- describe HTML_TAG_BALANCE_BODY HTML has unbalanced "body" tags
-
- body HTML_TAG_BALANCE_HEAD eval:html_tag_balance('head', '!= 0')
- describe HTML_TAG_BALANCE_HEAD HTML has unbalanced "head" tags
-
- # Tag presence rules: html_tag_exists is given the tag name to look for.
- body HTML_TAG_EXIST_MARQUEE eval:html_tag_exists('marquee')
- describe HTML_TAG_EXIST_MARQUEE HTML has "marquee" tag
-
- body HTML_TAG_EXIST_TBODY eval:html_tag_exists('tbody')
- describe HTML_TAG_EXIST_TBODY HTML has "tbody" tag
-
- # percentage of tags that are not legal elements in HTML
- # Ten html_range rules on the 'bad_tag_ratio' key, one per 0.10-wide window
- # from 0.00 to 1.00, each listed together with its description.
- body HTML_BADTAG_00_10 eval:html_range('bad_tag_ratio','0.00','0.10')
- describe HTML_BADTAG_00_10 HTML message is 0% to 10% bad tags
- body HTML_BADTAG_10_20 eval:html_range('bad_tag_ratio','0.10','0.20')
- describe HTML_BADTAG_10_20 HTML message is 10% to 20% bad tags
- body HTML_BADTAG_20_30 eval:html_range('bad_tag_ratio','0.20','0.30')
- describe HTML_BADTAG_20_30 HTML message is 20% to 30% bad tags
- body HTML_BADTAG_30_40 eval:html_range('bad_tag_ratio','0.30','0.40')
- describe HTML_BADTAG_30_40 HTML message is 30% to 40% bad tags
- body HTML_BADTAG_40_50 eval:html_range('bad_tag_ratio','0.40','0.50')
- describe HTML_BADTAG_40_50 HTML message is 40% to 50% bad tags
- body HTML_BADTAG_50_60 eval:html_range('bad_tag_ratio','0.50','0.60')
- describe HTML_BADTAG_50_60 HTML message is 50% to 60% bad tags
- body HTML_BADTAG_60_70 eval:html_range('bad_tag_ratio','0.60','0.70')
- describe HTML_BADTAG_60_70 HTML message is 60% to 70% bad tags
- body HTML_BADTAG_70_80 eval:html_range('bad_tag_ratio','0.70','0.80')
- describe HTML_BADTAG_70_80 HTML message is 70% to 80% bad tags
- body HTML_BADTAG_80_90 eval:html_range('bad_tag_ratio','0.80','0.90')
- describe HTML_BADTAG_80_90 HTML message is 80% to 90% bad tags
- body HTML_BADTAG_90_100 eval:html_range('bad_tag_ratio','0.90','1.00')
- describe HTML_BADTAG_90_100 HTML message is 90% to 100% bad tags
-
- # percentage of unique non-elements in HTML
- # Ten html_range rules on the 'non_element_ratio' key, one per 0.10-wide
- # window from 0.00 to 1.00, each listed together with its description.
- body HTML_NONELEMENT_00_10 eval:html_range('non_element_ratio','0.00','0.10')
- describe HTML_NONELEMENT_00_10 0% to 10% of HTML elements are non-standard
- body HTML_NONELEMENT_10_20 eval:html_range('non_element_ratio','0.10','0.20')
- describe HTML_NONELEMENT_10_20 10% to 20% of HTML elements are non-standard
- body HTML_NONELEMENT_20_30 eval:html_range('non_element_ratio','0.20','0.30')
- describe HTML_NONELEMENT_20_30 20% to 30% of HTML elements are non-standard
- body HTML_NONELEMENT_30_40 eval:html_range('non_element_ratio','0.30','0.40')
- describe HTML_NONELEMENT_30_40 30% to 40% of HTML elements are non-standard
- body HTML_NONELEMENT_40_50 eval:html_range('non_element_ratio','0.40','0.50')
- describe HTML_NONELEMENT_40_50 40% to 50% of HTML elements are non-standard
- body HTML_NONELEMENT_50_60 eval:html_range('non_element_ratio','0.50','0.60')
- describe HTML_NONELEMENT_50_60 50% to 60% of HTML elements are non-standard
- body HTML_NONELEMENT_60_70 eval:html_range('non_element_ratio','0.60','0.70')
- describe HTML_NONELEMENT_60_70 60% to 70% of HTML elements are non-standard
- body HTML_NONELEMENT_70_80 eval:html_range('non_element_ratio','0.70','0.80')
- describe HTML_NONELEMENT_70_80 70% to 80% of HTML elements are non-standard
- body HTML_NONELEMENT_80_90 eval:html_range('non_element_ratio','0.80','0.90')
- describe HTML_NONELEMENT_80_90 80% to 90% of HTML elements are non-standard
- body HTML_NONELEMENT_90_100 eval:html_range('non_element_ratio','0.90','1.00')
- describe HTML_NONELEMENT_90_100 90% to 100% of HTML elements are non-standard
-
- # short HTML messages with certain attributes
- # Three 'length' thresholds are used below: < 170 on its own, < 512
- # combined with a comment, and < 384 combined with a CENTER tag.
- body HTML_SHORT_LENGTH eval:html_eval('length', '< 170')
- describe HTML_SHORT_LENGTH HTML is extremely short
-
- # __COMMENT_EXISTS matches any "<!" ... ">" sequence in the 'comment' text;
- # the meta rule requires both sub-rules to hit.
- body __HTML_LENGTH_512 eval:html_eval('length', '< 512')
- body __COMMENT_EXISTS eval:html_text_match('comment', '<!.*?>')
- meta HTML_SHORT_COMMENT (__HTML_LENGTH_512 && __COMMENT_EXISTS)
- describe HTML_SHORT_COMMENT HTML is very short with HTML comments
-
- # Same shape as above: a length threshold AND-ed with a tag-presence check.
- body __HTML_LENGTH_384 eval:html_eval('length', '< 384')
- body __TAG_EXISTS_CENTER eval:html_tag_exists('center')
- meta HTML_SHORT_CENTER (__HTML_LENGTH_384 && __TAG_EXISTS_CENTER)
- describe HTML_SHORT_CENTER HTML is very short with CENTER tag
-
- # Uses html_text_not_match with a pattern for one non-whitespace character
- # ((?s) lets "." match newlines but does not affect \S), so the rule is
- # presumably meant to fire when the title has no visible text.
- body HTML_TITLE_EMPTY eval:html_text_not_match('title', '(?s)\S')
- describe HTML_TITLE_EMPTY HTML title contains no text
-
- # Case-insensitive match for "untitled" or "new page <digits>"; the
- # description only mentions the first alternative.
- body HTML_TITLE_UNTITLED eval:html_text_match('title', '(?i)(?:untitled|new page \d+)')
- describe HTML_TITLE_UNTITLED HTML title contains "Untitled"
-
- ###########################################################################
- # meta tests
- #
- # NOTE(review): __HIGHBITS, MIME_HTML_ONLY, __TAG_EXISTS_HTML and
- # __MIME_HTML are referenced below but not defined in the visible part of
- # this file; they are presumably provided by other rules files -- confirm.
-
- # Fires only when the html_charset_faraway() eval and __HIGHBITS both hit.
- body __HTML_CHARSET_FARAWAY eval:html_charset_faraway()
- meta HTML_CHARSET_FARAWAY (__HTML_CHARSET_FARAWAY && __HIGHBITS)
- describe HTML_CHARSET_FARAWAY A foreign language charset used in HTML markup
- tflags HTML_CHARSET_FARAWAY userconf
-
- # MIME_HTML_ONLY hit while __TAG_EXISTS_HTML did not.
- meta HTML_MIME_NO_HTML_TAG MIME_HTML_ONLY && !__TAG_EXISTS_HTML
- describe HTML_MIME_NO_HTML_TAG HTML-only message, but there is no HTML tag
-
- # HTML_MESSAGE hit while __MIME_HTML did not.
- meta HTML_MISSING_CTYPE (!__MIME_HTML && HTML_MESSAGE)
- describe HTML_MISSING_CTYPE Message is HTML without HTML Content-Type
-
- ###########################################################################
- # rawbody HTML tests
-
- # Case-insensitive: a "<" ... tag that contains "onMouseOver=" and, later
- # in the same tag (no ">" in between), "window.status=".
- rawbody HIDE_WIN_STATUS /<[^>]+onMouseOver=[^>]+window\.status=/i
- describe HIDE_WIN_STATUS Javascript to hide URLs in browser
-
- # Both sub-rules look for one or more "<!" ... ">" constructs squeezed
- # between two characters. _A requires word characters on both sides; _B is
- # looser (any non-space character that is not ">" before, and any non-space
- # character that is not "<" after). The meta accepts _A together with
- # HTML_MESSAGE, or _B together with MIME_HTML_ONLY (the latter rule is not
- # defined in the visible part of this file).
- rawbody __OBFUSCATING_COMMENT_A /\w(?:<![^>]*>)+\w/
- rawbody __OBFUSCATING_COMMENT_B /[^\s>](?:<![^>]*>)+[^\s<]/
- meta OBFUSCATING_COMMENT ((__OBFUSCATING_COMMENT_A && HTML_MESSAGE) || (__OBFUSCATING_COMMENT_B && MIME_HTML_ONLY))
- describe OBFUSCATING_COMMENT HTML comments which obfuscate text
-
- # spams that are assembled from a Javascript array
- # look for the XOR op
- # __JS_FROMCHARCODE matches "String.fromCharCode(" followed by an indexed
- # expression (name[index]) and then "^"; the meta also requires a literal
- # "document.write" somewhere in the raw body.
- rawbody __JS_FROMCHARCODE /String\.fromCharCode\s*\(\s*\S+\s*\[\s*\S+\s*\]\s*\^/
- rawbody __JS_DOCWRITE /document\.write/
- meta JS_FROMCHARCODE (__JS_FROMCHARCODE && __JS_DOCWRITE)
- describe JS_FROMCHARCODE Document is built from a Javascript charcode array
-
- # A-Z, a-z, 0-9 written as decimal character references (&#NN;), with any
- # number of leading zeros. Decimal codes covered by the alternation:
- #   48-57  (0-9): 4[89] | 5[0-7]
- #   65-90  (A-Z): 6[5-9] | [78]\d | 90 (through 9[0789])
- #   97-122 (a-z): 97-99 (through 9[0789]) | 1[01]\d | 12[012]
- # 6[5-9] and [78]\d must be separate alternatives: fused together they only
- # match four-digit codes (6570-6989) and the letters A-Y are never detected.
- rawbody ENTITY_DEC_ALPHANUM /\&\#0*(?:4[89]|5[0-7]|6[5-9]|[78]\d|9[0789]|1[01]\d|12[012])\;/
- describe ENTITY_DEC_ALPHANUM HTML contains needlessly encoded characters
-
- # ! $ % ' ( ) , - . / : ; = ? @ _
- # a good possible rule that may resurface
- #rawbody ENTITY_DEC_OTHER /\&\#0*(?:3[3679]|4[014567]|5[89]|6[134]|95)\;/
- #describe ENTITY_DEC_OTHER HTML contains needlessly encoded punctuation
-